countMatrix <- ReadDataFrameFromTsv(file.name.path="../data/refSEQ_countMatrix.txt")
## ../data/refSEQ_countMatrix.txt read from disk!
# head(countMatrix)
designMatrix <- ReadDataFrameFromTsv(file.name.path="../design/all_samples_short_names.tsv.csv")
## ../design/all_samples_short_names.tsv.csv read from disk!
# head(designMatrix)
rownames <- as.character(rownames(countMatrix))
rownames <- rownames[order(rownames)]
rownames.map <- convertGenesViaBiomart(specie="mm10", filter="entrezgene",
filter.values=rownames, attrs=c("external_gene_name",
"mgi_symbol", "entrezgene"))
noNaCountMatrix <- attachGeneColumnToDf(mainDf=countMatrix,
genesMap=rownames.map,
rowNamesIdentifier="entrezgene",
mapFromIdentifier="entrezgene",
mapToIdentifier="external_gene_name")
filteredCountsProp <- filterLowCounts(counts.dataframe=noNaCountMatrix,
is.normalized=FALSE,
design.dataframe=designMatrix,
cond.col.name="gcondition",
method.type="Proportion")
## features dimensions before normalization: 27179
## Filtering out low count features...
## 14531 features are to be kept for differential expression analysis with filtering method 3
pc1_2 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(filteredCountsProp), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC2", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="Prop-Un-Norm")
## [1] TRUE
pc2_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(filteredCountsProp), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC2", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="Prop-Un-Norm")
## [1] TRUE
pc1_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(filteredCountsProp), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="Prop-Un-Norm")
## [1] TRUE
plotly::subplot(pc1_2, pc2_3, pc1_3, nrows=2, margin = 0.1, titleX=TRUE, titleY=TRUE)
normPropCountsUqua <- NormalizeData(data.to.normalize=filteredCountsProp,
norm.type="uqua",
design.matrix=designMatrix)
pc1_2 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normPropCountsUqua), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC2", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA-Norm")
## [1] TRUE
pc2_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normPropCountsUqua), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC2", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA-Norm")
## [1] TRUE
pc1_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normPropCountsUqua), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA-Norm")
## [1] TRUE
plotly::subplot(pc1_2, pc2_3, pc1_3, nrows=2, margin = 0.1, titleX=TRUE, titleY=TRUE)
pal <- RColorBrewer::brewer.pal(9, "Set1")
plotRLE(as.matrix(normPropCountsUqua), outline=FALSE, col=pal[designMatrix$gcondition])
Estimating Negative Control Genes to normalize data
#### estimating neg controls
# options(stringsAsFactors=TRUE)
library(readxl)
rs2.ctrls <- read_excel(path="../data/controls/Additional File 4 full list of BMC genomics SD&RS2.xlsx", sheet=3)
rs2.ctrls <- rs2.ctrls[order(rs2.ctrls$adj.P.Val),]
# ctrls <- read.csv(file="data/controls/sd_controls_NA.csv", header=TRUE, sep="\t", quote="")
# ctrls <- ctrls[order(ctrls$adj.P.Val, decreasing=TRUE), ]
# head(rs2.ctrls)
# tail(rs2.ctrls)
rs2.neg.ctrls <- rs2.ctrls[rs2.ctrls$adj.P.Val > 0.9, ]
rs2.neg.ctrls <- rs2.neg.ctrls$`MGI Symbol`
rs2.neg.ctrls <- rs2.neg.ctrls[-which(is.na(rs2.neg.ctrls))]
sd.ctrls <- read_excel(path="../data/controls/Additional File 4 full list of BMC genomics SD&RS2.xlsx",sheet=1)
sd.ctrls <- sd.ctrls[order(sd.ctrls$adj.P.Val),]
# ctrls <- read.csv(file="data/controls/sd_controls_NA.csv", header=TRUE, sep="\t", quote="")
# ctrls <- ctrls[order(ctrls$adj.P.Val, decreasing=TRUE), ]
# head(sd.ctrls)
# tail(sd.ctrls)
sd.pos.ctrls <- sd.ctrls[sd.ctrls$adj.P.Val < 0.01, ]
sd.neg.ctrls <- sd.ctrls[sd.ctrls$adj.P.Val > 0.9, ]
sd.neg.ctrls <- sd.neg.ctrls$`MGI Symbol`
sd.neg.ctrls <- sd.neg.ctrls[-which(is.na(sd.neg.ctrls))]
int.neg.ctrls <- intersect(rs2.neg.ctrls, sd.neg.ctrls)
neg.map <- convertGenesViaBiomart(specie="mm10", filter="mgi_symbol",
filter.values=int.neg.ctrls, c("external_gene_name",
"mgi_symbol", "entrezgene"))
neg.map.nna <- neg.map[-which(is.na(neg.map$entrezgene)),]
# neg.map.nna <- neg.map.nna[which(neg.map.nna$entrezgene %in% rownames(normPropCountsUqua)),]
neg.ctrls.entrez <- as.character(neg.map.nna$entrezgene)
# length(neg.ctrls.entrez)
# normPropCountsUqua.neg <- attachGeneColumnToDf(mainDf=normPropCountsUqua,
# genesMap=neg.map.nna,
# rowNamesIdentifier="entrezgene",
# mapFromIdentifier="entrezgene",
# mapToIdentifier="external_gene_name")
# normPropCountsUqua.neg[which(!is.na(normPropCountsUqua.neg$check)),]
ind.ctrls <- which(rownames(normPropCountsUqua) %in% neg.ctrls.entrez)
counts.neg.ctrls <- normPropCountsUqua[ind.ctrls,]
pc1_2 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(counts.neg.ctrls), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC2", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA-Norm")
## [1] TRUE
pc2_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(counts.neg.ctrls), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC2", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA-Norm")
## [1] TRUE
pc1_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(counts.neg.ctrls), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA-Norm")
## [1] TRUE
plotly::subplot(pc1_2, pc2_3, pc1_3, nrows=2, margin = 0.1, titleX=TRUE, titleY=TRUE)
K=5
library(RUVSeq)
#groups <- makeGroups(designMatrix$classic)[1,,drop=FALSE]
neg.ctrl.list <- rownames(counts.neg.ctrls)
# neg.ctrl.list <- as.character(neg.map.nna$entrezgene[which(neg.map.nna$entrezgene %in% rownames(normPropCountsUqua))])
groups <- makeGroups(paste0(designMatrix$genotype, designMatrix$classic))[c(1, 3),]
ruvedSExprData <- RUVs(as.matrix(round(normPropCountsUqua)), cIdx=neg.ctrl.list,
scIdx=groups, k=5)
normExprData <- ruvedSExprData$normalizedCounts
pc1_2 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normExprData), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC2", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA+RUV-Norm")
## [1] TRUE
pc2_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normExprData), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC2", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA+RUV-Norm")
## [1] TRUE
pc1_3 <- PlotPCAPlotlyFunction(counts.data.frame=log1p(normExprData), design.matrix=designMatrix, shapeColname="genotype", colorColname="condition", xPCA="PC1", yPCA="PC3", plotly.flag=TRUE, show.plot.flag=FALSE, prefix.plot="UQUA+RUV-Norm")
## [1] TRUE
plotly::subplot(pc1_2, pc2_3, pc1_3, nrows=2, margin = 0.1, titleX=TRUE, titleY=TRUE)
pal <- RColorBrewer::brewer.pal(9, "Set1")
plotRLE(normExprData, outline=FALSE, col=pal[designMatrix$gcondition])
### edgering
desMat <- cbind(designMatrix, ruvedSExprData$W)
colnames(desMat) <- c(colnames(designMatrix), colnames(ruvedSExprData$W))
cc <- c("WTSD5 - WTHC5", "WTRS2 - WTHC7", "KOHC5 - WTHC5", "KOHC7 - WTHC7")
rescList1 <- applyEdgeR(counts=normPropCountsUqua, design.matrix=desMat,
factors.column="gcondition",
weight.columns=c("W_1", "W_2", "W_3", "W_4"),
contrasts=cc, useIntercept=FALSE, p.threshold=1,
verbose=TRUE)
PlotHistPvalPlot(de.results=rescList1[[1]], design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE,
prefix.plot=names(rescList1)[1])
# sd.pos.ctrls <- ReadDataFrameFromTsv(file.name.path="../data/SD_full_pos_control_genes_BMC_genomics.csv")
# sd.pos.ctrls <- sd.pos.ctrls$MGI.Symbol
# sd.pos.ctrls <- sd.pos.ctrls[-which(is.na(sd.pos.ctrls))]
#
# sd.pos.map <- convertGenesViaBiomart(specie="mm10", filter="mgi_symbol",
# filter.values=sd.pos.ctrls, c("external_gene_name",
# "mgi_symbol", "entrezgene"))
#
# sd.pos.map.nna <- sd.pos.map[-which(is.na(sd.pos.map$entrezgene)),]
#
# sd.pos.genes.entrez <- as.character(sd.pos.map.nna$entrezgene)
## mapping ensembl gene id using biomart
sd.pos.ctrls <- sd.pos.ctrls$`MGI Symbol`
sd.pos.ctrls <- sd.pos.ctrls[-which(is.na(sd.pos.ctrls))]
# length(sd.pos.ctrls)
res.o.map <- convertGenesViaBiomart(specie="mm10", filter="entrezgene",
filter.values=rownames(rescList1[[1]]),
c("external_gene_name", "mgi_symbol", "entrezgene"))
WriteDataFrameAsTsv(data.frame.to.save=rescList1[[1]],
file.name.path=paste0(names(rescList1)[1], "edgeR"))
res.o <- attachGeneColumnToDf(mainDf=rescList1[[1]],
genesMap=res.o.map,
rowNamesIdentifier="entrezgene",
mapFromIdentifier="entrezgene",
mapToIdentifier="external_gene_name")
PlotVolcanoPlot(de.results=res.o, counts.dataframe=normExprData,
design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE,
prefix.plot=names(rescList1)[1], threshold=0.05,
positive.ctrls.list=sd.pos.ctrls)
## [1] 564
de <- sum(res.o$FDR < 0.05)
nde <- sum(res.o$FDR >= 0.05)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[1]
ddetable <- detable
# PlotMAPlotCounts(de.results=res.o, counts.dataframe=normExprData, design.matrix=desMat,
# show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE, prefix.plot=names(rescList1)[1], threshold=0.05)
PlotHistPvalPlot(de.results=rescList1[[2]], design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE,
prefix.plot=names(rescList1)[2])
# rs2.pos.ctrls <- ReadDataFrameFromTsv(file.name.path="../data/RS2_pos_control_genes_BMC_genomics.csv")
# rs2.ctrls <- read_excel(path="data/controls/Additional File 4 full list of BMC genomics SD&RS2.xlsx", sheet=3)
#
#
# rs2.ctrls <- rs2.ctrls[order(rs2.ctrls$adj.P.Val),]
# # ctrls <- read.csv(file="data/controls/sd_controls_NA.csv", header=TRUE, sep="\t", quote="")
# # ctrls <- ctrls[order(ctrls$adj.P.Val, decreasing=TRUE), ]
# head(rs2.ctrls)
# tail(rs2.ctrls)
# rs2.neg.ctrls <- rs2.ctrls[rs2.ctrls$adj.P.Val > 0.9, ]
rs2.pos.ctrls <- rs2.ctrls[rs2.ctrls$adj.P.Val < 0.01, ]
rs2.pos.ctrls <- rs2.pos.ctrls$`MGI Symbol`
rs2.pos.ctrls <- rs2.pos.ctrls[-which(is.na(rs2.pos.ctrls))]
# rs2.neg.ctrls <- rs2.neg.ctrls[-which(is.na(rs2.neg.ctrls))]
#
# rs2.neg.map <- convertGenesViaBiomart(specie="mm10", filter="mgi_symbol",
# filter.values=sd.neg.ctrls, c("external_gene_name",
# "mgi_symbol", "entrezgene"))
#
# rs2.neg.map.nna <- rs2.neg.map[-which(is.na(rs2.neg.map$entrezgene)),]
#
# # neg.map.nna <- neg.map.nna[which(neg.map.nna$entrezgene %in% rownames(normPropCountsUqua)),]
# #
# rs2.neg.ctrls.entrez <- as.character(rs2.neg.map.nna$entrezgene)
#
# rs2.pos.ctrls <- rs2.pos.ctrls$MGI.Symbol
# rs2.pos.ctrls <- rs2.pos.ctrls[-(which(is.na(rs2.pos.ctrls)))]
# rs2.pos.map <- convertGenesViaBiomart(specie="mm10", filter="mgi_symbol",
# filter.values=rs2.pos.ctrls, c("external_gene_name",
# "mgi_symbol", "entrezgene"))
#
# rs2.pos.map.nna <- rs2.pos.map[-which(is.na(rs2.pos.map$entrezgene)),]
# rs2.pos.genes.entrez <- rs2.pos.map.nna$entrezgene
rs2.o.map <- convertGenesViaBiomart(specie="mm10", filter="entrezgene",
filter.values=rownames(rescList1[[2]]),
c("external_gene_name", "mgi_symbol", "entrezgene"))
WriteDataFrameAsTsv(data.frame.to.save=rescList1[[2]],
file.name.path=paste0(names(rescList1)[2], "edgeR"))
res.rs2.o <- attachGeneColumnToDf(mainDf=rescList1[[2]],
genesMap=rs2.o.map,
rowNamesIdentifier="entrezgene",
mapFromIdentifier="entrezgene",
mapToIdentifier="external_gene_name")
PlotVolcanoPlot(de.results=res.rs2.o, counts.dataframe=normExprData,
design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE,
prefix.plot=names(rescList1)[2], threshold=0.05,
positive.ctrls.list=rs2.pos.ctrls)
## [1] 159
de <- sum(res.rs2.o$FDR < 0.05)
nde <- sum(res.rs2.o$FDR >= 0.05)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[2]
ddetable <- rbind(ddetable, detable)
# PlotMAPlotCounts(de.results=res.o, counts.dataframe=normExprData, design.matrix=desMat,
# show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE, prefix.plot=names(rescList1)[2], threshold=0.05)
PlotHistPvalPlot(de.results=rescList1[[3]], design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE,
prefix.plot=names(rescList1)[3])
## mapping ensembl gene id using biomart
res.o.map <- convertGenesViaBiomart(specie="mm10", filter="entrezgene",
filter.values=rownames(rescList1[[3]]),
c("external_gene_name", "mgi_symbol", "entrezgene"))
WriteDataFrameAsTsv(data.frame.to.save=rescList1[[3]],
file.name.path=paste0(names(rescList1)[3], "_edgeR"))
res.o <- attachGeneColumnToDf(mainDf=rescList1[[3]],
genesMap=res.o.map,
rowNamesIdentifier="entrezgene",
mapFromIdentifier="entrezgene",
mapToIdentifier="external_gene_name")
PlotVolcanoPlot(de.results=res.o, counts.dataframe=normExprData,
design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE,
prefix.plot=names(rescList1)[4], threshold=0.05,
positive.ctrls.list=NULL)
de <- sum(res.o$FDR < 0.05)
nde <- sum(res.o$FDR >= 0.05)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[3]
ddetable <- rbind(ddetable, detable)
# PlotMAPlotCounts(de.results=res.o, counts.dataframe=normExprData, design.matrix=desMat,
# show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE, prefix.plot=names(rescList1)[1], threshold=0.05)
PlotHistPvalPlot(de.results=rescList1[[4]], design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE,
prefix.plot=names(rescList1)[4])
## mapping ensembl gene id using biomart
res.o.map <- convertGenesViaBiomart(specie="mm10", filter="entrezgene",
filter.values=rownames(rescList1[[4]]),
c("external_gene_name", "mgi_symbol", "entrezgene"))
WriteDataFrameAsTsv(data.frame.to.save=rescList1[[4]],
file.name.path=paste0(names(rescList1)[4], "_edgeR"))
res.o <- attachGeneColumnToDf(mainDf=rescList1[[4]],
genesMap=res.o.map,
rowNamesIdentifier="entrezgene",
mapFromIdentifier="entrezgene",
mapToIdentifier="external_gene_name")
PlotVolcanoPlot(de.results=res.o, counts.dataframe=normExprData,
design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE,
prefix.plot=names(rescList1)[4], threshold=0.05,
positive.ctrls.list=NULL)
de <- sum(res.o$FDR < 0.05)
nde <- sum(res.o$FDR >= 0.05)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[4]
ddetable <- rbind(ddetable, detable)
# PlotMAPlotCounts(de.results=res.o, counts.dataframe=normExprData, design.matrix=desMat,
# show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE, prefix.plot=names(rescList1)[1], threshold=0.05)
ddetable
## de nde
## WTSD5 - WTHC5 4823 9708
## WTRS2 - WTHC7 3379 11152
## KOHC5 - WTHC5 11 14520
## KOHC7 - WTHC7 14 14517
### edgering
newDesMat <- cbind(desMat, paste0(desMat$genotype, desMat$classic))
colnames(newDesMat) <- c(colnames(desMat), "genclass")
cc <- c("KOCTRL - WTCTRL")
rescList1 <- applyEdgeR(counts=normPropCountsUqua, design.matrix=newDesMat,
factors.column="genclass",
weight.columns=c("W_1", "W_2", "W_3", "W_4"),
contrasts=cc, useIntercept=FALSE, p.threshold=1,
verbose=TRUE)
PlotHistPvalPlot(de.results=rescList1[[1]], design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE,
prefix.plot=names(rescList1)[1])
## mapping ensembl gene id using biomart
res.o.map <- convertGenesViaBiomart(specie="mm10", filter="entrezgene",
filter.values=rownames(rescList1[[1]]),
c("external_gene_name", "mgi_symbol", "entrezgene"))
WriteDataFrameAsTsv(data.frame.to.save=rescList1[[1]],
file.name.path=paste0(names(rescList1)[1], "_edgeR"))
res.o <- attachGeneColumnToDf(mainDf=rescList1[[1]],
genesMap=res.o.map,
rowNamesIdentifier="entrezgene",
mapFromIdentifier="entrezgene",
mapToIdentifier="external_gene_name")
PlotVolcanoPlot(de.results=res.o, counts.dataframe=normExprData,
design.matrix=desMat,
show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE,
prefix.plot=names(rescList1)[1], threshold=0.05,
positive.ctrls.list=NULL)
de <- sum(res.o$FDR < 0.05)
nde <- sum(res.o$FDR >= 0.05)
detable <- cbind(de,nde)
rownames(detable) <- names(rescList1)[1]
# PlotMAPlotCounts(de.results=res.o, counts.dataframe=normExprData, design.matrix=desMat,
# show.plot.flag=TRUE, plotly.flag=TRUE, save.plot=FALSE, prefix.plot=names(rescList1)[1], threshold=0.05)
ddetable
## de nde
## WTSD5 - WTHC5 4823 9708
## WTRS2 - WTHC7 3379 11152
## KOHC5 - WTHC5 11 14520
## KOHC7 - WTHC7 14 14517